{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "90abb800",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import geopandas as gpd\n",
    "import shapely\n",
    "import seaborn as sns\n",
    "import matplotlib.pyplot as plt\n",
    "import folium\n",
    "import folium.plugins as fp\n",
    "from openai import OpenAI\n",
    "client = OpenAI(api_key='api_key')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e74501e3",
   "metadata": {},
   "outputs": [],
   "source": [
    "bua_gdf = gpd.read_file(\"Data/ONS_BUA_2022_GB_BGG.gpkg\").to_crs(\"EPSG:4326\")\n",
    "bua_gdf_ew = bua_gdf[bua_gdf['BUA22CD'].str[0] != \"S\"].reset_index(drop=True).drop(columns=['BUA22NMW', 'BUA22NMG', 'BNG_E', 'BNG_N'])\n",
    "bua_gdf_ew['area_ha_2dp'] = round((bua_gdf_ew.to_crs('EPSG:27700')['geometry'].area / 10000), 2)\n",
    "ons_characteristics_data = {}\n",
    "for i in range(1,11): # load in and concantenate data for England and Wales from ONS BUA characteristics data from 2021\n",
    "    ons_characteristics_data['characteristic_data_' + str(i)] = pd.concat((pd.read_excel(\"Data/townsandcitiescharacteristicsofbuiltupareasenglandandwalescensus2021.xlsx\", sheet_name=(str(i)+\"c\"), skiprows=2), (pd.read_excel(\"Data/townsandcitiescharacteristicsofbuiltupareasenglandandwalescensus2021.xlsx\", sheet_name=(str(i)+\"d\"), skiprows=2))), ignore_index=True)\n",
    "bua_gdf_ew_pop = bua_gdf_ew.merge(ons_characteristics_data['characteristic_data_1'], how='inner', left_on='BUA22CD', right_on='BUA code').drop(columns=['Country code', 'Region code', 'BUA22CD', 'BUA22NM', 'LONG', 'LAT'])\n",
    "bua_gdf_ew_pop.columns = ['geometry', 'Area in ha (2dp)', 'Country', 'Region', 'BUA Code', 'BUA Name', 'BUA Size Classification', '2021 Resident Population']\n",
    "pubs_bua_data = pd.read_excel(\"Data/accesstoamenitiespubs.xlsx\", sheet_name=\"Table 2\", skiprows=5).iloc[:,[0,1,2,3]]\n",
    "worship_bua_data = pd.read_excel(\"Data/accesstoamenitiesreligiousworship.xlsx\", sheet_name=\"Table 2\", skiprows=5).iloc[:,[0,1,2,16]]\n",
    "communities_bua_data = pd.read_excel(\"Data/accesstoamenitiescommunityfacilities.xlsx\", sheet_name=\"Table 2\", skiprows=5).iloc[:,[0,1,2,8]]\n",
    "bua_other_characteristics_df = pubs_bua_data.merge(worship_bua_data, how='outer', on='BUA code').merge(communities_bua_data, how='outer', on='BUA code').iloc[:,[0,1,2,3,5,6,8,9]]\n",
    "bua_other_characteristics_df.columns = ['BUA Code', 'BUA Name', 'Pub Count', 'Pubs per 100,000 people', 'Religious Worship Sites Count', 'Religious Worship Sites per 100,000 people', 'Community Facility Count', 'Community Facilities per 100,000 people']\n",
    "greenspace_bua_data = pd.read_excel(\"Data/buafunctionalgreenspacearea.xlsx\", sheet_name=\"Table 2\", skiprows=4).iloc[:,[0,1,2,3,4]]\n",
    "greenspace_bua_data.columns = ['BUA Code', 'BUA Name', 'Area (m2)', 'Functional greenspace area (m2)', 'Functional greenspace area percentage (%)']\n",
    "bua_other_characteristics_df = greenspace_bua_data.merge(bua_other_characteristics_df.iloc[:,1:], how='left', on='BUA Name')\n",
    "bua_gdf_ew_some_data = bua_gdf_ew_pop.merge(bua_other_characteristics_df, how='left', on=['BUA Code', 'BUA Name']) # will give other characteristics for Medium or larger BUA size classification BUAs\n",
    "bua_gdf_ew_some_data['Area in ha percentile (%)'] = bua_gdf_ew_some_data['Area in ha (2dp)'].rank(pct=True) * 100.00\n",
    "counties_gdf = gpd.read_file(\"Data/Upper_Tier_Local_Authorities_December_2022_Boundaries_UK_BFC.gpkg\").to_crs(\"EPSG:4326\") # Source: https://geoportal.statistics.gov.uk/datasets/f6c95adbbc2949b5a63fd97833562d2e_0/explore\n",
    "centroid_data = gpd.read_file(\"Data/ONS_BUA_2022_GB_BGG.gpkg\").to_crs(\"EPSG:4326\")[['BUA22CD', 'LONG', 'LAT']]\n",
    "centroids = gpd.GeoDataFrame(data=centroid_data, geometry=gpd.points_from_xy(centroid_data['LONG'], centroid_data['LAT']), crs=\"EPSG:4326\")\n",
    "bua_counties_df = centroids.sjoin(counties_gdf[['UTLA22NM','geometry']], how='inner', predicate='covered_by')[['BUA22CD','UTLA22NM']]\n",
    "bua_counties_df.columns = ['BUA Code', 'County Name']\n",
    "bua_2022_gdf = bua_gdf_ew_some_data.merge(bua_counties_df, how='inner', on='BUA Code')\n",
    "bua_2022_gdf"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5203ed74",
   "metadata": {},
   "outputs": [],
   "source": [
    "bua_2022_gdf['BUA Size Classification'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "56bd23cb",
   "metadata": {},
   "outputs": [],
   "source": [
    "# def get_constitutional_coord_for_place(place, subregion, region, completion_notifications=True):\n",
    "\n",
    "#   global ai_coord_list_text\n",
    "\n",
    "#   input_prompt_text = f\"State a location coordinate of minimum and maximum 1 lat/long point where the centre of the place of {place} in {subregion}, {region}, UK constitutes. \\\n",
    "#         A range of official or unofficial information can inform the final judgement. Guess if necessary. \\\n",
    "#             Present this final response without any other text at all, in a consistent standard numerical text format, as a text lat/long pair. \\\n",
    "#                 Respond as 1 coordinate as a text-written Python tuple, strictly without bounding brackets, in the format: long,lat\"\n",
    "  \n",
    "#   def run_ai_api_query():\n",
    "#     response = client.responses.create(\n",
    "#       model=\"gpt-4o-mini\",\n",
    "#       input=input_prompt_text,\n",
    "#       store=True,\n",
    "#       service_tier=\"priority\"\n",
    "#     )\n",
    "\n",
    "#     ai_coord_list_text = response.output_text\n",
    "\n",
    "#     return ai_coord_list_text\n",
    "  \n",
    "#   while True:\n",
    "#     try:\n",
    "#       ai_coord_list_coords = run_ai_api_query()\n",
    "#       break\n",
    "#     except:\n",
    "#       print(\"Trying again...\")\n",
    "#       continue\n",
    "  \n",
    "  \n",
    "#   if completion_notifications:\n",
    "#     print(f\"Finished processing for {place} in {region}\")\n",
    "  \n",
    "#   return ai_coord_list_coords\n",
    "\n",
    "# def run_polygons_fetch_routine(candidate_places_gdf):\n",
    "#   point_geometry_responses_list = []\n",
    "#   for place_i in range(0,len(bua_2022_gdf)):\n",
    "#     place = bua_2022_gdf.iloc[place_i]['BUA Name']\n",
    "#     subregion = bua_2022_gdf.iloc[place_i]['County Name']\n",
    "#     region = bua_2022_gdf.iloc[place_i]['Country']\n",
    "#     text_response = get_constitutional_coord_for_place(place, subregion, region).strip(\"'\").strip('(').strip(')').split(\",\")\n",
    "#     try:\n",
    "#       point_value = shapely.geometry.Point(np.float64(text_response[0]), np.float64(text_response[1]))\n",
    "#     except:\n",
    "#       point_value = None\n",
    "#     point_geometry_responses_list += [point_value] # long_value, then lat_value, being fed in\n",
    "#   return point_geometry_responses_list"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a1c725fb",
   "metadata": {},
   "outputs": [],
   "source": [
    "# def build_pre_vis_gdf(point_geometry_responses_list):\n",
    "#     global bua_2022_gdf_regional_explore\n",
    "#     bua_2022_gdf['place_point_perception_openai'] = point_geometry_responses_list\n",
    "#     bua_2022_gdf_regional_explore = bua_2022_gdf[['BUA Name', 'place_point_perception_openai', 'geometry']]\n",
    "#     bua_2022_gdf_regional_explore.columns = [\"PLACE\", \"OPENAI PLACE POINT PERCEPTION\", \"geometry\"]\n",
    "#     pd.DataFrame(bua_2022_gdf_regional_explore).to_csv(\"2510_all_buas_place_location_study_data_gpt_4o_mini.csv\") # saving is here (optional)\n",
    "#     return None\n",
    "\n",
    "# point_responses_list_gpt4o_mini = run_polygons_fetch_routine(bua_2022_gdf)\n",
    "# build_pre_vis_gdf(point_responses_list_gpt4o_mini)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "33127850",
   "metadata": {},
   "outputs": [],
   "source": [
    "# bua_2022_gdf_regional_explore.set_geometry('OPENAI PLACE POINT PERCEPTION').set_crs('EPSG:4326').explore(tiles='CartoDB.Positron').save(\"point_map_test.html\")\n",
    "# bua_2022_gdf_regional_explore.loc[:,'official_bua_centroid_point'] = bua_2022_gdf_regional_explore.to_crs('EPSG:3857').geometry.centroid\n",
    "# bua_2022_gdf_regional_explore.loc[:,'distance'] = bua_2022_gdf_regional_explore.to_crs('EPSG:32630').distance(bua_2022_gdf_regional_explore.set_geometry('OPENAI PLACE POINT PERCEPTION').set_crs('EPSG:4326').to_crs('EPSG:32630'))\n",
    "# bua_2022_gdf_regional_explore = bua_2022_gdf_regional_explore.set_geometry('official_bua_centroid_point').to_crs('EPSG:4326')\n",
    "# bua_2022_gdf_regional_explore"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b5214369",
   "metadata": {},
   "outputs": [],
   "source": [
    "# bua_2022_gdf[bua_2022_gdf['BUA Name']==\"Ambleside\"].reset_index(drop=True)#['BUA Name'].unique().tolist()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d8df4aea",
   "metadata": {},
   "outputs": [],
   "source": [
    "# bua_2022_gdf_regional_explore.to_csv(\"bua_point_llm_output_gdf.csv\")\n",
    "\n",
    "from shapely import wkt\n",
    "\n",
    "saved_file_df = gpd.read_file(\"bua_point_llm_output_gdf.csv\")[['PLACE','OPENAI PLACE POINT PERCEPTION', 'geometry', 'official_bua_centroid_point', 'distance']]\n",
    "\n",
    "def clean_wkt_text(x):\n",
    "    if pd.isna(x):\n",
    "        return None\n",
    "    t = str(x).strip()\n",
    "    if t.lower() in {\"\", \"null\", \"none\", \"nan\"}:\n",
    "        return None\n",
    "    if (t.startswith('\"') and t.endswith('\"')) or (t.startswith(\"'\") and t.endswith(\"'\")):\n",
    "        t = t[1:-1].strip()\n",
    "    # remove BOM and control chars\n",
    "    t = t.encode(\"utf-8\", \"ignore\").decode(\"utf-8\").replace(\"\\ufeff\", \"\")\n",
    "    return t\n",
    "\n",
    "s_clean = saved_file_df['OPENAI PLACE POINT PERCEPTION'].apply(clean_wkt_text)\n",
    "saved_file_df[\"OPENAI PLACE POINT PERCEPTION\"] = s_clean\n",
    "s_clean_2 = saved_file_df['geometry'].apply(clean_wkt_text)\n",
    "saved_file_df[\"geometry\"] = s_clean_2\n",
    "s_clean_3 = saved_file_df['official_bua_centroid_point'].apply(clean_wkt_text)\n",
    "saved_file_df[\"official_bua_centroid_point\"] = s_clean_3\n",
    "saved_file_df[\"OPENAI PLACE POINT PERCEPTION\"] = saved_file_df[\"OPENAI PLACE POINT PERCEPTION\"].apply(shapely.wkt.loads)\n",
    "saved_file_df[\"geometry\"] = saved_file_df[\"geometry\"].apply(shapely.wkt.loads)\n",
    "saved_file_df[\"official_bua_centroid_point\"] = saved_file_df[\"official_bua_centroid_point\"].apply(shapely.wkt.loads)\n",
    "saved_file_gdf = gpd.GeoDataFrame(data=saved_file_df, geometry='OPENAI PLACE POINT PERCEPTION', crs='EPSG:4326')\n",
    "\n",
    "bua_2022_gdf_regional_explore = saved_file_gdf.copy()\n",
    "bua_2022_gdf_regional_explore"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b4daaf83",
   "metadata": {},
   "outputs": [],
   "source": [
    "concat_gdf = bua_2022_gdf_regional_explore.copy()\n",
    "concat_gdf['OPENAI PLACE POINT PERCEPTION'] = concat_gdf['OPENAI PLACE POINT PERCEPTION'].to_crs('EPSG:32630')\n",
    "concat_gdf = concat_gdf.set_geometry('official_bua_centroid_point').set_crs('EPSG:4326')\n",
    "concat_gdf = concat_gdf.to_crs('EPSG:32630')\n",
    "concat_gdf['distance'] = concat_gdf['OPENAI PLACE POINT PERCEPTION'].distance(concat_gdf['official_bua_centroid_point'])\n",
    "# concat_gdf['distance'].describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "25bdbe0c",
   "metadata": {},
   "outputs": [],
   "source": [
    "concat_gdf"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4408ce66",
   "metadata": {},
   "outputs": [],
   "source": [
    "reliability_distance_scores = concat_gdf.copy()[['PLACE','distance']]\n",
    "reliability_distance_scores.to_csv(\"2512_distance_scores_per_bua_final.csv\", index=False)\n",
    "reliability_distance_scores"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6ab7e768",
   "metadata": {},
   "outputs": [],
   "source": [
    "# concat_gdf['distance'].describe().reset_index().to_csv(\"2512_domesday_distance_results_overall_table.csv\", index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "eb5e35eb",
   "metadata": {},
   "outputs": [],
   "source": [
    "concat_gdf_sized = concat_gdf.copy().merge(bua_2022_gdf[['BUA Name', 'BUA Size Classification']], how='left', left_on='PLACE', right_on='BUA Name')\n",
    "describe_table_error_distances_by_bua_size_2dp = concat_gdf_sized.groupby(['BUA Size Classification'])['distance'].describe().reset_index().round(2)\n",
    "describe_table_error_distances_by_bua_size_2dp.sort_values('count')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a83efa49",
   "metadata": {},
   "outputs": [],
   "source": [
    "concat_gdf_sized['distance'].describe().round(2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "432737de",
   "metadata": {},
   "outputs": [],
   "source": [
    "# describe_table_error_distances_by_bua_size_2dp.to_csv(\"2512_domesday_distance_results_breakdown_table_per_bua_size_class.csv\", index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "eeffdcce",
   "metadata": {},
   "outputs": [],
   "source": [
    "# concat_gdf_sized[concat_gdf_sized['distance'].isna()].to_csv(\"2512_domesday_failed_openai_bua_points_ie_no_distances_available.csv\", index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f4bf10e9",
   "metadata": {},
   "outputs": [],
   "source": [
    "concat_gdf_sized_output = concat_gdf_sized.copy()[['PLACE','BUA Size Classification','OPENAI PLACE POINT PERCEPTION','official_bua_centroid_point','distance']]\n",
    "concat_gdf_sized_output.columns = ['ONS BUA PLACE NAME', 'ONS BUA SIZE CLASSIFICATION', 'OPENAI POINT', 'ONS BUA CENTROID POINT', 'DISTANCE (METRES)']\n",
    "# concat_gdf_sized_output.to_csv(\"2512_full_output_data_geographic_bias_task.csv\", index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a92a9c58",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Scale up figure sizes\n",
    "plt.rcParams.update({'font.size': 20})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a19586b2",
   "metadata": {},
   "outputs": [],
   "source": [
    "# ERROR DISTANCE KDE DISTRIBUTIONS SUBFIGURE\n",
    "plt.figure(figsize=(12,6))\n",
    "sns.kdeplot(data=concat_gdf_sized[concat_gdf_sized['BUA Size Classification']==\"Minor\"].reset_index(drop=True), x='distance', color=\"#0072B2\", label='Minor', cut=0, bw_adjust=0.5, clip=(0, None), linewidth=2.5, linestyle='--')\n",
    "sns.kdeplot(data=concat_gdf_sized[concat_gdf_sized['BUA Size Classification']==\"Small\"].reset_index(drop=True), x='distance', color=\"#56B4E9\", label='Small', cut=0, bw_adjust=0.5, clip=(0, None), linewidth=2, linestyle='--')\n",
    "sns.kdeplot(data=concat_gdf_sized[concat_gdf_sized['BUA Size Classification']==\"Medium\"].reset_index(drop=True), x='distance', color=\"#999999\", label='Medium', cut=0, bw_adjust=0.5, clip=(0, None), linewidth=2, linestyle='--')\n",
    "sns.kdeplot(data=concat_gdf_sized[concat_gdf_sized['BUA Size Classification']==\"Large\"].reset_index(drop=True), x='distance', color=\"#009E73\", label='Large', cut=0, bw_adjust=0.5, clip=(0, None), linewidth=2, linestyle='--')\n",
    "sns.kdeplot(data=concat_gdf_sized[concat_gdf_sized['BUA Size Classification']==\"Major\"].reset_index(drop=True), x='distance', color=\"#CC79A7\", label='Major', cut=0, bw_adjust=0.5, clip=(0, None), linewidth=2, linestyle='--')\n",
    "plt.legend()\n",
    "plt.xlim(0,12000)\n",
    "plt.ylabel(\"Density\\n\")\n",
    "plt.xlabel(\"\\nError Distance (m)\")\n",
    "# plt.title(\"Error Distance KDEs per BUA Size Class\\n\", fontsize=16)\n",
    "# plt.savefig(\"2512_subfigure_error_distances_kdes_per_bua_size_class.png\", dpi=600, bbox_inches='tight')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c76a92d0",
   "metadata": {},
   "outputs": [],
   "source": [
    "# ERROR DISTANCE KDE ALL CLASS SIZES TOTAL DISTRIBUTION SUBFIGURE\n",
    "plt.figure(figsize=(12,6))\n",
    "sns.kdeplot(data=concat_gdf_sized, x='distance', color=\"black\", label='All', cut=0, bw_adjust=0.5, clip=(0, None), linewidth=2.5, linestyle='--')\n",
    "plt.legend()\n",
    "plt.xlim(0,1200000)\n",
    "plt.ylabel(\"Density\\n\")\n",
    "plt.xlabel(\"\\nError Distance (m)\")\n",
    "# plt.savefig(\"2512_subfigure_error_distances_kde_total.png\", dpi=600, bbox_inches='tight')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5be84219",
   "metadata": {},
   "outputs": [],
   "source": [
    "uk_regions_gdf = gpd.read_file(\"Data/uk_regions_data/Counties_and_Unitary_Authorities_December_2024_Boundaries_UK_BFC.gpkg\")[['CTYUA24CD', 'CTYUA24NM', 'geometry']]\n",
    "ew_regions_gdf = uk_regions_gdf[uk_regions_gdf['CTYUA24CD'].str[0].isin(['E','W'])].reset_index(drop=True)\n",
    "ew_regions_gdf['nation_code'] = ew_regions_gdf['CTYUA24CD'].str[0]\n",
    "ew_regions_gdf_dissolved = ew_regions_gdf.dissolve(by='nation_code').reset_index(drop=True)\n",
    "ew_regions_gdf_dissolved['nation'] = ['England', 'Wales']\n",
    "ew_regions_gdf_dissolved = ew_regions_gdf_dissolved[['geometry','nation']]\n",
    "ew_regions_gdf_dissolved.plot('nation');"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "55ed263c",
   "metadata": {},
   "outputs": [],
   "source": [
    "ew_regions_gdf_dissolved"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8a99dbc1",
   "metadata": {},
   "outputs": [],
   "source": [
    "# NATIONAL DIFFERENCES: ENGLAND VS WALES\n",
    "concat_gdf_sized_ew = concat_gdf_sized.to_crs('EPSG:4326').sjoin(ew_regions_gdf.to_crs('EPSG:4326'), how='inner', predicate='intersects')\n",
    "concat_gdf_sized_ew_aggregations = concat_gdf_sized_ew.groupby('nation_code').median('distance').reset_index()[['nation_code', 'distance']]\n",
    "concat_gdf_sized_ew_aggregations"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3c31fa61",
   "metadata": {},
   "outputs": [],
   "source": [
    "concat_gdf_sized_ew_aggregations = concat_gdf_sized_ew.groupby('nation_code').mean('distance').reset_index()[['nation_code', 'distance']]\n",
    "concat_gdf_sized_ew_aggregations"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3cf3d03a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# MEAN REGIONAL DIFFERENCES\n",
    "concat_gdf_sized_ew_regions = concat_gdf_sized.to_crs('EPSG:4326').sjoin(ew_regions_gdf.to_crs('EPSG:4326'), how='inner', predicate='intersects')\n",
    "concat_gdf_sized_ew_regions = concat_gdf_sized_ew_regions.groupby(['CTYUA24NM']).mean('distance').reset_index()[['CTYUA24NM', 'distance']].sort_values('distance').reset_index(drop=True)\n",
    "concat_gdf_sized_ew_counties = concat_gdf_sized_ew[['CTYUA24NM','geometry']].set_geometry('geometry').dissolve(by='CTYUA24NM').reset_index().merge(concat_gdf_sized_ew_regions).set_crs(\"EPSG:4326\")\n",
    "concat_gdf_sized_ew_counties[['CTYUA24NM', 'distance']].to_csv(\"2512_mean_distances_by_county_ew.csv\", index=False)\n",
    "# concat_gdf_sized_ew_counties.explore('distance').save(\"2512_mean_distances_by_county_ew.html\")\n",
    "\n",
    "# MEDIAN REGIONAL DIFFERENCES\n",
    "concat_gdf_sized_ew_regions = concat_gdf_sized.to_crs('EPSG:4326').sjoin(ew_regions_gdf.to_crs('EPSG:4326'), how='inner', predicate='intersects')\n",
    "concat_gdf_sized_ew_regions = concat_gdf_sized_ew_regions.groupby(['CTYUA24NM']).median('distance').reset_index()[['CTYUA24NM', 'distance']].sort_values('distance').reset_index(drop=True)\n",
    "concat_gdf_sized_ew_counties = concat_gdf_sized_ew[['CTYUA24NM','geometry']].set_geometry('geometry').dissolve(by='CTYUA24NM').reset_index().merge(concat_gdf_sized_ew_regions).set_crs(\"EPSG:4326\")\n",
    "concat_gdf_sized_ew_counties[['CTYUA24NM', 'distance']].to_csv(\"2512_median_distances_by_county_ew.csv\", index=False)\n",
    "# concat_gdf_sized_ew_counties.explore('distance').save(\"2512_median_distances_by_county_ew.html\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ab9b4bfc",
   "metadata": {},
   "outputs": [],
   "source": [
    "mean_errors_regions = pd.read_csv(\"2512_mean_distances_by_county_ew.csv\")\n",
    "mean_errors_regions.describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "abcd9f81",
   "metadata": {},
   "outputs": [],
   "source": [
    "median_errors_regions = pd.read_csv(\"2512_median_distances_by_county_ew.csv\")\n",
    "median_errors_regions.describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2aa546c1",
   "metadata": {},
   "outputs": [],
   "source": [
    "print(concat_gdf_sized_ew_counties.crs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8cf2a9b4",
   "metadata": {},
   "outputs": [],
   "source": [
    "concat_gdf_sized_ew_regions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8bc65051",
   "metadata": {},
   "outputs": [],
   "source": [
    "concat_gdf = concat_gdf.to_crs('EPSG:4326').set_geometry('OPENAI PLACE POINT PERCEPTION').to_crs('EPSG:4326')\n",
    "concat_gdf"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "73a85003",
   "metadata": {},
   "outputs": [],
   "source": [
    "# now round for visualisation purposes\n",
    "concat_gdf['distance'] = concat_gdf['distance'].round(0) # 0 dp to get whole figrue in 'metres'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c95d3991",
   "metadata": {},
   "outputs": [],
   "source": [
    "def generate_pydeck_layer_df(df, model_name):\n",
    "    df.loc[:,'lat_i'] = [df['official_bua_centroid_point'][i].coords[0][0] if (not df['official_bua_centroid_point'][i] == None) else None for i in range(0, len(df['official_bua_centroid_point']))]\n",
    "    df.loc[:,'lng_i'] = [df['official_bua_centroid_point'][i].coords[0][1] if (not df['official_bua_centroid_point'][i] == None) else None for i in range(0, len(df['official_bua_centroid_point']))]\n",
    "    df.loc[:,'lat_j'] = [df['OPENAI PLACE POINT PERCEPTION'][i].coords[0][0] if (not df['OPENAI PLACE POINT PERCEPTION'][i] == None) else None for i in range(0, len(df['OPENAI PLACE POINT PERCEPTION']))]\n",
    "    df.loc[:,'lng_j'] = [df['OPENAI PLACE POINT PERCEPTION'][i].coords[0][1] if (not df['OPENAI PLACE POINT PERCEPTION'][i] == None) else None for i in range(0, len(df['OPENAI PLACE POINT PERCEPTION']))]\n",
    "    df.loc[:,'model_name'] = model_name\n",
    "    df = df[['lat_i','lng_i', 'lat_j', 'lng_j', 'distance', 'model_name', 'PLACE']]\n",
    "    df.loc[:,'distance_m'] = df['distance'].astype('str')\n",
    "    return df\n",
    "\n",
    "concat_gpt4o_mini_pydeck = generate_pydeck_layer_df(concat_gdf, \"GPT-4o Mini\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "dcfb18a9",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pydeck as pdk\n",
    "\n",
    "arc_layer_4o_mini = pdk.Layer(\n",
    "    \"ArcLayer\",\n",
    "    data=concat_gpt4o_mini_pydeck,\n",
    "    get_width=\"2\",\n",
    "    get_source_position=\"[lat_i, lng_i]\",\n",
    "    get_target_position=\"[lat_j, lng_j]\",\n",
    "    get_tilt=0,\n",
    "    get_source_color=[238, 232, 220, 220],\n",
    "    get_target_color=[215, 48, 39, 200],\n",
    "    pickable=True,\n",
    "    auto_highlight=True,\n",
    ")\n",
    "view_state = pdk.ViewState(latitude=53.00, longitude=-1.00, bearing=10, pitch=45, zoom=6)\n",
    "\n",
    "TOOLTIP_TEXT = {\"html\": \"Place: <strong>{PLACE}</strong></br>Model: <strong>{model_name}</strong></br>Distance: <strong>{distance_m}m</strong>\"}\n",
    "\n",
    "overall_r = pdk.Deck([arc_layer_4o_mini], initial_view_state=view_state, tooltip=TOOLTIP_TEXT)\n",
    "overall_r.to_html(\"2510_pydeck_arc_visualisation_gpt4o_mini_gpt_point_to_bua_centroid_distance_comparisons.html\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "20a2d345",
   "metadata": {},
   "outputs": [],
   "source": [
    "bua_2022_gdf_concise = bua_2022_gdf[['Country', 'Region', 'BUA Code', 'BUA Name', 'BUA Size Classification']]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9df2ef4a",
   "metadata": {},
   "outputs": [],
   "source": [
    "full_gdf_to_filter = pd.concat([bua_2022_gdf_concise, concat_gdf], axis=1)\n",
    "full_gdf_to_filter"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b49e51de",
   "metadata": {},
   "outputs": [],
   "source": [
    "concat_gdf_major = full_gdf_to_filter[full_gdf_to_filter['BUA Size Classification'] == \"Major\"].reset_index(drop=True)\n",
    "concat_gpt4o_mini_pydeck_major = generate_pydeck_layer_df(concat_gdf_major, \"GPT-4o Mini\")\n",
    "\n",
    "concat_gdf_large = full_gdf_to_filter[full_gdf_to_filter['BUA Size Classification'] == \"Large\"].reset_index(drop=True)\n",
    "concat_gpt4o_mini_pydeck_large = generate_pydeck_layer_df(concat_gdf_large, \"GPT-4o Mini\")\n",
    "\n",
    "concat_gdf_medium = full_gdf_to_filter[full_gdf_to_filter['BUA Size Classification'] == \"Medium\"].reset_index(drop=True)\n",
    "concat_gpt4o_mini_pydeck_medium = generate_pydeck_layer_df(concat_gdf_medium, \"GPT-4o Mini\")\n",
    "\n",
    "concat_gdf_small = full_gdf_to_filter[full_gdf_to_filter['BUA Size Classification'] == \"Small\"].reset_index(drop=True)\n",
    "concat_gpt4o_mini_pydeck_small = generate_pydeck_layer_df(concat_gdf_small, \"GPT-4o Mini\")\n",
    "\n",
    "concat_gdf_minor = full_gdf_to_filter[full_gdf_to_filter['BUA Size Classification'] == \"Minor\"].reset_index(drop=True)\n",
    "concat_gpt4o_mini_pydeck_minor = generate_pydeck_layer_df(concat_gdf_minor, \"GPT-4o Mini\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "811341ed",
   "metadata": {},
   "outputs": [],
   "source": [
    "print(\"\\n\\nMAJOR\", concat_gdf_major['distance'].describe())\n",
    "print(\"\\n\\nLARGE\", concat_gdf_large['distance'].describe())\n",
    "print(\"\\n\\nMEDIUM\", concat_gdf_medium['distance'].describe())\n",
    "print(\"\\n\\nSMALL\", concat_gdf_small['distance'].describe())\n",
    "print(\"\\n\\nMINOR\", concat_gdf_minor['distance'].describe())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "160a24fd",
   "metadata": {},
   "outputs": [],
   "source": [
    "view_state = pdk.ViewState(latitude=51.74, longitude=-0.60, bearing=-10, pitch=80, zoom=6)\n",
    "TOOLTIP_TEXT = {\"html\": \"Place: <strong>{PLACE}</strong></br>Model: <strong>{model_name}</strong></br>Distance: <strong>{distance_m}m</strong>\"}\n",
    "\n",
    "\n",
    "arc_layer_4o_mini_major = pdk.Layer(\n",
    "    \"ArcLayer\",\n",
    "    data=concat_gpt4o_mini_pydeck_major,\n",
    "    get_width=\"1.46\",\n",
    "    get_source_position=\"[lat_j, lng_j]\",\n",
    "    get_target_position=\"[lat_i, lng_i]\",\n",
    "    get_tilt=0,\n",
    "    get_source_color=[215, 48, 39, 200],\n",
    "    get_target_color=[238, 232, 220, 220],\n",
    "    pickable=True,\n",
    "    auto_highlight=True,\n",
    ")\n",
    "major_r = pdk.Deck([arc_layer_4o_mini_major], initial_view_state=view_state, tooltip=TOOLTIP_TEXT)\n",
    "major_r.to_html(\"2510_pydeck_arc_visualisation_gpt4o_mini_1major_gpt_point_to_bua_centroid_distance_comparisons.html\")\n",
    "\n",
    "arc_layer_4o_mini_large = pdk.Layer(\n",
    "    \"ArcLayer\",\n",
    "    data=concat_gpt4o_mini_pydeck_large,\n",
    "    get_width=\"1.33\",\n",
    "    get_source_position=\"[lat_j, lng_j]\",\n",
    "    get_target_position=\"[lat_i, lng_i]\",\n",
    "    get_tilt=0,\n",
    "    get_source_color=[215, 48, 39, 200],\n",
    "    get_target_color=[238, 232, 220, 220],\n",
    "    pickable=True,\n",
    "    auto_highlight=True,\n",
    ")\n",
    "large_r = pdk.Deck([arc_layer_4o_mini_large], initial_view_state=view_state, tooltip=TOOLTIP_TEXT)\n",
    "large_r.to_html(\"2510_pydeck_arc_visualisation_gpt4o_mini_2large_gpt_point_to_bua_centroid_distance_comparisons.html\")\n",
    "\n",
    "arc_layer_4o_mini_medium = pdk.Layer(\n",
    "    \"ArcLayer\",\n",
    "    data=concat_gpt4o_mini_pydeck_medium,\n",
    "    get_width=\"1.21\",\n",
    "    get_source_position=\"[lat_j, lng_j]\",\n",
    "    get_target_position=\"[lat_i, lng_i]\",\n",
    "    get_tilt=0,\n",
    "    get_source_color=[215, 48, 39, 200],\n",
    "    get_target_color=[238, 232, 220, 220],\n",
    "    pickable=True,\n",
    "    auto_highlight=True,\n",
    ")\n",
    "medium_r = pdk.Deck([arc_layer_4o_mini_medium], initial_view_state=view_state, tooltip=TOOLTIP_TEXT)\n",
    "medium_r.to_html(\"2510_pydeck_arc_visualisation_gpt4o_mini_3medium_gpt_point_to_bua_centroid_distance_comparisons.html\")\n",
    "\n",
    "arc_layer_4o_mini_small = pdk.Layer(\n",
    "    \"ArcLayer\",\n",
    "    data=concat_gpt4o_mini_pydeck_small,\n",
    "    get_width=\"1.1\",\n",
    "    get_source_position=\"[lat_j, lng_j]\",\n",
    "    get_target_position=\"[lat_i, lng_i]\",\n",
    "    get_tilt=0,\n",
    "    get_source_color=[215, 48, 39, 200],\n",
    "    get_target_color=[238, 232, 220, 220],\n",
    "    pickable=True,\n",
    "    auto_highlight=True,\n",
    ")\n",
    "small_r = pdk.Deck([arc_layer_4o_mini_small], initial_view_state=view_state, tooltip=TOOLTIP_TEXT)\n",
    "small_r.to_html(\"2510_pydeck_arc_visualisation_gpt4o_mini_4small_gpt_point_to_bua_centroid_distance_comparisons.html\")\n",
    "\n",
    "arc_layer_4o_mini_minor = pdk.Layer(\n",
    "    \"ArcLayer\",\n",
    "    data=concat_gpt4o_mini_pydeck_minor,\n",
    "    get_width=\"1\",\n",
    "    get_source_position=\"[lat_j, lng_j]\",\n",
    "    get_target_position=\"[lat_i, lng_i]\",\n",
    "    get_tilt=0,\n",
    "    get_source_color=[215, 48, 39, 200],\n",
    "    get_target_color=[238, 232, 220, 220],\n",
    "    pickable=True,\n",
    "    auto_highlight=True,\n",
    ")\n",
    "minor_r = pdk.Deck([arc_layer_4o_mini_minor], initial_view_state=view_state, tooltip=TOOLTIP_TEXT)\n",
    "minor_r.to_html(\"2510_pydeck_arc_visualisation_gpt4o_mini_5minor_gpt_point_to_bua_centroid_distance_comparisons.html\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e2b1fcb7",
   "metadata": {},
   "outputs": [],
   "source": [
    "overall_weighted_r = pdk.Deck([arc_layer_4o_mini_minor, arc_layer_4o_mini_small, arc_layer_4o_mini_medium, arc_layer_4o_mini_large, arc_layer_4o_mini_major], initial_view_state=view_state, tooltip=TOOLTIP_TEXT)\n",
    "overall_weighted_r.to_html(\"2512_pydeck_arc_visualisation_gpt4o_mini_gpt_point_to_bua_centroid_distance_comparisons_all_weighted.html\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
